# The sentiment scoring step is very slow, so this script reads a
# pre-computed data file ("us_tweets.csv") instead of re-running it.
us_tweets <- read_csv("us_tweets.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   tweet_id = col_double(),
##   date = col_date(format = ""),
##   hour = col_time(format = ""),
##   user_name = col_character(),
##   nickname = col_character(),
##   bio = col_character(),
##   tweet_content = col_character(),
##   latitude = col_double(),
##   longitude = col_double(),
##   country = col_character(),
##   place_as_appears_on_bio = col_character(),
##   profile_picture = col_character(),
##   tweet_url = col_character()
## )
## See spec(...) for full column specifications.

# Clean the tweet text in two passes: first strip every character that is
# not a letter or a space, then drop words of one or two letters
# (collapsing the surrounding runs of spaces into one).
us_tweets <- us_tweets %>%
  mutate(
    tweet_content_stripped = gsub("[^[:alpha:] ]", "", tweet_content),
    tweet_content_stripped = gsub(" *\\b[[:alpha:]]{1,2}\\b *", " ",
                                  tweet_content_stripped)
  )
# Total count of each NRC emotion across all tweets.
# Select the eight emotion columns by name (anger through trust) instead of
# the fragile positional indices c(20:27) used originally — the range matches
# the anger:trust span used by the gather() call below.
emotion_sums <- colSums(select(us_tweets, anger:trust))

sentimentTotals <- data.frame(sentiment = names(emotion_sums),
                              count = emotion_sums)

sentimentTotals
##                 sentiment count
## anger               anger 13605
## anticipation anticipation 52960
## disgust           disgust 12668
## fear                 fear 19942
## joy                   joy 46690
## sadness           sadness 21882
## surprise         surprise 22067
## trust               trust 76347
# Reshape to long format: one row per tweet x emotion, with the eight NRC
# emotion columns (anger through trust) stacked into sentiment/count pairs.
# factor_key = TRUE keeps `sentiment` as a factor levelled in column order.
us_tweets_long <- gather(us_tweets, sentiment, count, anger:trust, 
                         factor_key = TRUE)
# NOTE(review): this conversion runs AFTER the gather above, so
# us_tweets_long$hour keeps the original col_time representation while
# us_tweets$hour becomes POSIXct (required by scale_x_datetime below).
# The leading space in " %H:%M" presumably mirrors the printed time
# values — confirm it parses as intended.
us_tweets$hour <- as.POSIXct(us_tweets$hour, format = " %H:%M")

# Number of tweets posted at each hour of the day.
# geom_bar() counts rows per x value directly; the original
# geom_histogram(stat = "count") emitted "Ignoring unknown parameters:
# binwidth, bins, pad" because histogram parameters do not apply to a
# count stat. The y label is also corrected: the axis shows a count,
# not a proportion.
ggplot(data = us_tweets, aes(x = hour)) +
  geom_bar() +
  xlab("Time") + ylab("Number of tweets") +
  ggtitle("Number of Tweets per Hour") +
  scale_x_datetime(labels = date_format("%H:%M"))

us_tweets$charsintweet <- sapply(us_tweets$tweet_content, function(x) nchar(x))

# Distribution of tweet lengths in 8-character bins, shaded by bin count.
ggplot(us_tweets, aes(x = charsintweet)) +
  geom_histogram(aes(fill = ..count..), binwidth = 8) +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4") +
  xlim(0, 150) +
  xlab("Characters per Tweet") +
  ylab("Number of tweets") +
  ggtitle("Characters per Tweet") +
  theme(legend.position = "none")
## xlim() drops observations outside [0, 150], which is what produces the
## "Removed ... rows containing non-finite values" warnings when rendered:
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

# Bar chart of total counts per NRC emotion.
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
ggplot(data = sentimentTotals, aes(x = sentiment, y = count)) +
  geom_col(aes(fill = sentiment)) +
  theme(legend.position = "none") +
  xlab("Sentiment") + 
  ylab("Total Count") + 
  ggtitle("Total Sentiment Score for All Tweets in Sample")

# One row per (tweet, word) token from the cleaned tweet text.
tweet_words <- us_tweets %>% 
  unnest_tokens(word, tweet_content_stripped)

data(stop_words)

# Drop common stop words. The explicit key pins the intended join column
# and silences the 'Joining, by = "word"' message.
tweet_words <- anti_join(tweet_words, stop_words, by = "word")
# Word cloud of the 200 most frequent non-stop words.
# brewer.pal() requires n >= 3; the original brewer.pal(2, "Dark2") warned
# ("minimal value for n is 3") and returned 3 colors anyway, so request 3
# explicitly — same palette, no warning.
tweet_words %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 200, 
                 random.order = FALSE, 
                 rot.per = 0.35,  
                 colors = brewer.pal(3, "Dark2")))

pal2 <- brewer.pal(8, "Dark2")

# Horizontal bar chart of the 10 most frequent words (ties kept, as top_n
# does). Naming the ranking column explicitly silences the "Selecting by n"
# message; geom_col() replaces geom_bar(stat = "identity").
tweet_words %>% 
  count(word, sort = TRUE) %>% 
  top_n(10, n) %>% 
  mutate(word = fct_reorder(word, n)) %>% 
  ggplot(aes(x = word, y = n)) + 
  geom_col(fill = "blue", alpha = .6) + 
  coord_flip()

# Extract hashtags from the raw tweet text, normalise them (strip
# punctuation, lowercase), and tabulate their frequencies.
hashtags <- str_extract_all(us_tweets$tweet_content, "#\\S+")
hashtags <- unlist(hashtags)
hashtags <- gsub("[^[:alnum:] ]", "", hashtags)
hashtags <- tolower(hashtags)

hashtag.df <- data.frame(table(hashtags))
hashtag.df$hashtags <- as.character(hashtag.df$hashtags)
# table() already yields an integer Freq column, so the original
# as.numeric(as.character(...)) round-trip through character was redundant;
# a direct as.numeric() gives the same double vector.
hashtag.df$Freq <- as.numeric(hashtag.df$Freq)
hashtag.df <- arrange(hashtag.df, desc(Freq))

# Twenty most common hashtags.
print(hashtag.df[1:20,])
##           hashtags  Freq
## 1              job 51511
## 2           hiring 45428
## 3             jobs 21910
## 4        careerarc 20717
## 5           retail  7454
## 6      hospitality  7311
## 7          nursing  5091
## 8       healthcare  4702
## 9         veterans  4471
## 10           sales  3310
## 11              it  2179
## 12 customerservice  1927
## 13  transportation  1568
## 14           sonic  1520
## 15   manufacturing  1476
## 16           photo  1432
## 17    businessmgmt  1348
## 18      accounting  1053
## 19     engineering   970
## 20         traffic   955
# Interactive map of US tweet locations, colored by follower count, with a
# hover label showing followers and the free-text bio location.
us_tweets %>%
  filter(country == "US") %>%
  mutate(text_label = str_c("followers: ", followers, '\nlocation: ', place_as_appears_on_bio)) %>%
  plot_ly(x = ~longitude, y = ~latitude,
          type = "scatter", mode = "markers", alpha = 0.5,
          color = ~followers,
          text = ~text_label)
# Interactive map of US tweet locations, colored by positive-sentiment
# score, with a hover label showing the score and bio location.
us_tweets %>%
  filter(country == "US") %>%
  mutate(text_label = str_c("sentiment: ", positive, '\nlocation: ', place_as_appears_on_bio)) %>%
  plot_ly(x = ~longitude, y = ~latitude,
          type = "scatter", mode = "markers", alpha = 0.5,
          color = ~positive, colors = "Set2",
          text = ~text_label)
# Interactive map of US tweets colored by sentiment name (long format);
# rows with a zero count for a sentiment are excluded.
us_tweets_long %>%
  filter(country == "US") %>%
  filter(count > 0) %>%
  mutate(text_label = str_c("sentiment: ", sentiment, '\nlocation: ', place_as_appears_on_bio)) %>%
  plot_ly(x = ~longitude, y = ~latitude,
          type = "scatter", mode = "markers", alpha = 0.5,
          color = ~sentiment,
          text = ~text_label)
# Open question: how to map interactive tweets per hour?
# For now: US tweets colored by sentiment, with the hour as the hover text.
us_tweets_long %>%
  filter(country == "US") %>%
  plot_ly(x = ~longitude, y = ~latitude,
          type = "scatter", mode = "markers", alpha = 0.5,
          color = ~sentiment,
          text = ~hour)
# Coordinates only, as input for the point-in-polygon state lookup below.
# Uses <- for assignment and bare column names, per tidyverse style.
state_tweets <- us_tweets %>%
  select(longitude, latitude)

library(sp)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(maptools)
## Checking rgeos availability: TRUE
library(gpclib)
## General Polygon Clipper Library for R (version 1.5-5)
##  Type 'class ? gpc.poly' for help
library(stringi)

# Map (longitude, latitude) rows of `state_tweets` to lower-48 US state
# names via point-in-polygon lookup; points outside every state get NA.
latlong2state <- function(state_tweets) {
    # State outlines from the maps database, as raw polygon data.
    state_map <- map('state', fill = TRUE, col = "transparent", plot = FALSE)

    # Polygon names look like "washington:san juan island"; keep only the
    # state part so all pieces of a state share one ID.
    region_ids <- sapply(strsplit(state_map$names, ":"),
                         function(parts) parts[1])
    state_polys <- map2SpatialPolygons(
        state_map, IDs = region_ids,
        proj4string = CRS("+proj=longlat +datum=WGS84"))

    # Tweet coordinates in the same CRS as the polygons.
    tweet_points <- SpatialPoints(
        state_tweets,
        proj4string = CRS("+proj=longlat +datum=WGS84"))

    # For each point, the index of the polygon containing it (NA if none).
    poly_index <- over(tweet_points, state_polys)

    poly_names <- sapply(state_polys@polygons, function(p) p@ID)
    poly_names[poly_index]
}

us_tweets$state_name <- latlong2state(state_tweets)

# Per-state totals for every sentiment column.
# Fixes versus the original:
#  - select by name rather than positional indices c(32, 20:29)
#    (anger:trust plus the negative/positive scores and state_name);
#  - na.omit(state_name) silently IGNORED its argument — na.omit on a data
#    frame drops rows with an NA in ANY column — so filter explicitly on
#    the one column we mean (tweets outside the mapped states);
#  - funs() is deprecated; pass the function to summarise_all directly.
us_sentiments <- us_tweets %>%
  filter(country == "US") %>%
  select(state_name, anger:trust, negative, positive) %>%
  filter(!is.na(state_name)) %>%
  group_by(state_name) %>%
  summarise_all(sum) %>%
  mutate(positive = as.numeric(positive),
         negative = as.numeric(negative))
library(choroplethr)
## Loading required package: acs
## Loading required package: XML
## 
## Attaching package: 'acs'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:base':
## 
##     apply
library(choroplethrMaps)

# Choropleth of total negative-sentiment counts by state.
# transmute() builds the two-column region/value frame state_choropleth()
# expects in a single step.
us_sentiments %>%
  transmute(region = state_name, value = negative) %>%
  state_choropleth()
## Warning in self$bind(): The following regions were missing and are being
## set to NA: alaska, maine, hawaii

# Choropleth of total positive-sentiment counts by state.
# transmute() builds the two-column region/value frame state_choropleth()
# expects in a single step.
us_sentiments %>%
  transmute(region = state_name, value = positive) %>%
  state_choropleth()
## Warning in self$bind(): The following regions were missing and are being
## set to NA: alaska, maine, hawaii